###### Text Preparation ##### #-------------------------------------------#

# This script prepares the event descriptions in ACLED for topic modelling


#------------- Load and prepare ACLED data --------------------------------------------

# Load the ACLED protest events, derive the variables needed for the text
# analysis, and store the reduced data set on disk.
# The pipeline relies on `%>%`, the dplyr verbs, str_detect() and yearweek()
# already being available in the session (no library() calls are visible in
# this part of the script).
all_protests_sub <- rio::import("data/acled_subset.rda") %>%
  filter(year > 2017) %>% # subset to relevant years (2018 onwards)
  arrange(country, year) %>% # sort by country and year
  mutate(
    # NOTE(review): "%B" parses full month names in the current locale, so
    # this presumably expects English month names (e.g. "15 March 2020") and
    # an English LC_TIME setting -- confirm.
    event_date = as.Date(event_date, "%d %B %Y"),
    year_week = yearweek(event_date),
    # Events before 15 March 2020 are labelled "pre"; every other event,
    # including any with a missing date, falls through to "during".
    pandemic = case_when(
      event_date < as.Date("2020-03-15") ~ "pre",
      TRUE ~ "during"
    ),
    data_id = as.character(data_id),
    # Identify Covid-related protests from the event notes.
    # NOTE(review): each `*` only quantifies the preceding letter, so this
    # pattern matches any note containing "coron", "pandemi" or "covi". The
    # match is case-sensitive, so e.g. "COVID-19" alone is not flagged.
    # The pattern is left unchanged to keep the results reproducible --
    # confirm that this is the intended selection.
    corona_topic = case_when(
      str_detect(notes, "corona*|pandemic*|covid*") ~ 1,
      TRUE ~ 0 # also covers notes that are missing
    )
  ) %>%
  # subset to relevant variables
  select(data_id, event_date, country, year, year_week, notes, pandemic,
         corona_topic)

save(all_protests_sub, file = "output/all_protests_sub.Rda")

# Keep only the events flagged as Covid-related and drop the
# "Fridays for Future" (FFF) protests, which were unrelated to the pandemic.
# Both capitalisations of "for" that occur in the notes are excluded.
corona_protests <- filter(
  all_protests_sub,
  corona_topic == 1,
  !str_detect(notes, "Fridays for"),
  !str_detect(notes, "Fridays For")
)

#------------- Tokenize ACLED data --------------------------------------------

# Strip leading and trailing white space from the event notes
corona_protests[["notes_clean"]] <- stri_trim(corona_protests[["notes"]])

# The English language annotation model from the udpipe package has already
# been downloaded and is included in the replication material (see folder
# "data"). The download call is therefore commented out.

#ud_model <- udpipe_download_model(language = "english")

# Load the English tokenization model from disk
ud_model <- udpipe_load_model("data/english-ewt-ud-2.5-191206.udpipe")

# Annotate the event notes (one document per ACLED data_id) and store the
# annotation as a data frame
x <- as.data.frame(
  udpipe_annotate(
    ud_model,
    x = corona_protests$notes_clean,
    doc_id = corona_protests$data_id,
    trace = 100
  )
)
#str(x)

# Save the annotated tokens as R data
save(list = "x", file = "output/acled_tokenized_corona.Rda")

# load tokens
#load("output/acled_tokenized_corona.Rda")

#### 

# Keep only adjectives and nouns; this also removes punctuation tokens
x_sub <- x %>%
  filter(upos %in% c("ADJ", "NOUN"))

# Convert all lemmas to lowercase
x_sub$lemma <- stri_trans_tolower(x_sub$lemma)

# Kick out "[size=no report]"-style tags, a common feature of ACLED event notes.
# NOTE(review): the filter works on the sentence column, so every remaining
# token of a sentence that contains "[size=" is dropped, not just the tag
# itself -- confirm that the tag always forms its own sentence.
x_sub <- x_sub %>%
  filter(!grepl("[size=", sentence, fixed = TRUE))

# Subset to relevant variables: document id and lemma
x_sub <- x_sub[, c("doc_id", "lemma")]

# Remove a series of common but uninformative lemmas.
# All entries must be lowercase because the lemmas were lowercased above;
# the former entry "Sukkur" could never match and is now "sukkur". A
# duplicated "part" entry was dropped.
x_sub <- subset(x_sub, !(lemma %in% c("protest", "protesters", "protester", "protestor",
                                      "rally", "group", "members", "member",
                                      "size", "city", "other", "square", "tr",
                                      "resident", "residents", "headquarters",
                                      "activists", "activist", "th", "march", "several",
                                      "support", "supporters", "supporter", "more",
                                      "town", "state", "part", "demand", "sit", "conference",
                                      "people", "front", "vigil", "strike",
                                      "demonstration", "district", "ten", "rd", "st",
                                      "demonstrators", "demonstrator", "year", "thousand",
                                      "dozen", "hundred", "various", "street", "students",
                                      "dozens", "hundreds", "thousands", "sukkur", "day",
                                      "number", "kctu", "pkk", "in", "related",
                                      "government", "pandemic", "coronavirus",
                                      "police", "press", "banner")))


# Collapse the lemmas of each event description into a single
# space-separated string (one row per doc_id)
x_sub_corpus <- summarise(
  group_by(x_sub, doc_id),
  lemmas = paste(lemma, collapse = " ")
)

# Build a corpus from the collapsed lemma strings using the quanteda package
corpus <- corpus(x_sub_corpus, text_field = "lemmas")
print(corpus)

# Tokenize the corpus and remove punctuation
toks_nopunct <- tokens(corpus, remove_punct = TRUE)
print(toks_nopunct)

# Convert tokens to data frame

# Convert a tokens object into a long data frame with one row per token.
#
# Args:
#   x:   A named, list-like object with one element per document, for which
#        names(), lengths() and unlist() are defined (here: the tokens object
#        created above).
#   ...: Ignored. Accepted so that the signature is compatible with the
#        as.data.frame() generic, whose method naming scheme this function
#        follows.
#
# Returns:
#   A data.frame with two character columns: `doc_id` (the element name,
#   repeated once per token of that document) and `tokens` (the tokens in
#   document order). Documents without tokens contribute no rows.
as.data.frame.tokens <- function(x, ...) {
  data.frame(
    doc_id = rep(names(x), lengths(x)),
    tokens = unlist(x, use.names = FALSE),
    # keep both columns as character regardless of the R version's default
    stringsAsFactors = FALSE
  )
}

# Flatten the punctuation-free tokens into a long data frame
# (one row per token, identified by doc_id)
acled_dat <- as.data.frame.tokens(toks_nopunct)

# Store the final token data as R data
save(list = "acled_dat", file = "output/acled_tokens_final.Rda")
